import pandas as pd
import numpy as np
def precission(y_hat, y_true):
    """Precision of binary predictions: TP / (TP + FP).

    Name keeps the original (misspelled) spelling for compatibility with
    existing callers throughout this notebook.

    y_hat  -- array-like of 0/1 predictions.
    y_true -- array-like of 0/1 ground-truth labels, same length.
    Returns 0.0 when there are no positive predictions (the original
    divided by zero and produced NaN).
    """
    tp = np.sum(y_true[y_hat == 1])
    tot_p = np.sum(y_hat)
    if tot_p == 0:
        return 0.0
    return tp / tot_p
def recall(y_hat, y_true):
    """Recall of binary predictions: TP / (TP + FN).

    y_hat  -- array-like of 0/1 predictions.
    y_true -- array-like of 0/1 ground-truth labels, same length.
    Returns 0.0 when the ground truth contains no positives (the
    original divided by zero and produced NaN).
    """
    tp = np.sum(y_true[y_hat == 1])
    fn = np.sum(y_true[y_hat == 0])
    if tp + fn == 0:
        return 0.0
    return tp / (tp + fn)
# Load the radar point cloud and coerce the integer-valued columns.
ds = pd.read_csv('./radar_points.txt')
ds = ds.astype({'seq': int, 'object_label': int})
ds.head()
cross_section - можно думать, что это логарифм площади
relative_lateral_velocity - скорость относительно радара, радиальная ее часть
absolute_radial_velocity - скорость относительно мира
belongs_to_object - попала ли точка по нашему мнению на машину
# Naive baseline: flag every point whose world-frame radial speed
# exceeds 2 m/s as belonging to a car.
dummy_mask = ds.absolute_radial_velocity > 2
y_hat = np.where(dummy_mask, 1.0, 0.0)
print('precission', precission(y_hat, ds.belongs_to_object.values))
print('recall', recall(y_hat, ds.belongs_to_object.values))
Машины отражают по несколько точек
# A car reflects several radar points, so grow the detections: any point
# whose nearest already-detected point lies within 2 m is also flagged.
dummy_cars = ds[dummy_mask][['x', 'y']].values
from sklearn.neighbors import NearestNeighbors
index = NearestNeighbors()
index.fit(dummy_cars)
d, _ = index.kneighbors(ds[['x', 'y']], n_neighbors=2)
nearest_within_2m = d[:, 0] < 2.0
y_hat = np.zeros(len(ds.values))
y_hat[nearest_within_2m] = 1.0
print('precission', precission(y_hat, ds.belongs_to_object.values))
print('recall', recall(y_hat, ds.belongs_to_object.values))
import matplotlib.pyplot as plt
from IPython import display
import time
def visualize_ds(ds):
    """Animate the dataset scene-by-scene inside a notebook cell.

    Draws one scatter frame per `seq` (colored by relative radial
    velocity) into the same output area, then clears it at the end.
    """
    ax = plt.gca()
    # groupby iterates seq keys in ascending order, one frame per scene.
    for seq, scene in ds.groupby('seq'):
        ax.clear()
        ax.set_title(seq)
        plt.scatter(scene.x, scene.y, s=5, c=scene.relative_radial_velocity)
        ax.set_xlim((440, 500))
        ax.set_ylim((552, 582))
        display.clear_output(wait=True)
        display.display(plt.gcf())
        time.sleep(0.01)
    display.clear_output()
visualize_ds(ds)

# Split by scene (seq) so frames from one scene never leak across
# train/val/test: 50% train, 30% val, rest test.
ds = ds.sort_values(by=['seq'])
seqs = ds.seq.unique()
nseqs = len(seqs)
train_part = 0.5
val_part = 0.3
train_end = int(train_part * nseqs)
val_end = train_end + int(val_part * nseqs)
train_seqs, val_seqs, test_seqs = (
    seqs[:train_end], seqs[train_end:val_end], seqs[val_end:])
train_ds = ds[np.isin(ds['seq'], train_seqs)]
val_ds = ds[np.isin(ds['seq'], val_seqs)]
test_ds = ds[np.isin(ds['seq'], test_seqs)]
# Feature matrices: every column except metadata and the labels.
COL_DROPS = ['stamp', 'id', 'seq', 'object_label', 'belongs_to_object']
X_train = train_ds.drop(COL_DROPS, axis=1)
X_val = val_ds.drop(COL_DROPS, axis=1)
X_test = test_ds.drop(COL_DROPS, axis=1)
y_train = train_ds['belongs_to_object'].astype('int32')
y_val = val_ds['belongs_to_object'].astype('int32')
y_test = test_ds['belongs_to_object'].astype('int32')
X_train.head()
len(X_train), len(X_val), len(X_test), len(ds)
visualize_ds(train_ds)
import catboost
def learn(X_train, X_val, y_train, y_val):
    """Fit a CatBoost binary classifier on raw point features.

    Early-stops on the validation set and returns the model from the
    best iteration.
    """
    params = dict(
        custom_loss=['AUC', 'Accuracy'],
        n_estimators=1000,
        depth=6,
        learning_rate=1e-2,
        rsm=0.7,
        l2_leaf_reg=1.5,
    )
    clf = catboost.CatBoostClassifier(**params)
    clf.fit(
        catboost.Pool(X_train, y_train),
        eval_set=(X_val.values, y_val.values),
        early_stopping_rounds=100,
        use_best_model=True,
        plot=True,
        verbose=False,
    )
    return clf
# Train the baseline classifier on the raw per-point features.
cls = learn(X_train, X_val, y_train, y_val)
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
import plotly
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import tqdm
def test_one(clf, X_test, y_test):
    """Return a (precision, recall, threshold) curve for `clf`,
    downsampled to at most 2000 points for plotting."""
    scores = clf.predict_proba(X_test)[:, 1]
    pr, rec, thr = precision_recall_curve(y_test, scores)
    ix = np.linspace(1, len(pr) - 1, num=2000).astype(int)
    # thr is one element shorter than pr/rec, hence the -1 shift.
    return pr[ix], rec[ix], thr[ix - 1]
# def heuristic_filter_scoring():
# pr = []
# rec = []
# filter_range = list(range(1, 10))
# for i in filter_range:
# y_test_heuristic_hat = np.ones(len(heu_X_test))
# y_test_heuristic_hat[filter_by_intensity(heu_X_test.intensity, i)] = 0
# pr.append(precision_score(heu_y_test, y_test_heuristic_hat))
# rec.append(recall_score(heu_y_test, y_test_heuristic_hat))
# return pr, rec, filter_range
# pr_bl, rec_bl, thr_bl = heuristic_filter_scoring()
def plot_pr_rec(*models):
    """Plot interactive precision-recall curves with plotly.

    Each model is a tuple (label, fitted_classifier, X_test, y_test);
    the hover text carries the decision threshold at each curve point.
    """
    traces = []
    for model, clf, X_test, y_test in models:
        pr, rec, thr = test_one(clf, X_test, y_test)
        traces.append(
            go.Scattergl(x=rec, y=pr, mode='lines', text=thr, name=model))
    layout = go.Layout(
        # Fixed the "Precission" misspelling in the user-facing titles.
        title='Precision-recall',
        xaxis=dict(title='Recall'),
        yaxis=dict(title='Precision'),
    )
    fig = go.Figure(data=traces, layout=layout)
    py.iplot(fig)
# Evaluate the baseline classifier on the held-out test scenes.
models = [('my classifier', cls, X_test, y_test)]
plot_pr_rec(*models)
from sklearn.neighbors import KDTree
class ComputeFeatures(object):
    """Per-point feature extractor for one radar scene.

    For every point it collects the raw per-point columns (`base_names`)
    plus, for each neighbourhood radius in `self.r`, aggregates over the
    point's spatial neighbours (`r_names`): the neighbour count and the
    mean / median absolute radial velocity.
    """

    def __init__(self):
        self.r = [1.0]  # neighbourhood radii (same units as x/y/z)
        self.base_names = [
            'relative_radial_velocity',
            'relative_lateral_velocity',
            'cross_section',
            'distance_rms',
            'angle_rms',
            'radial_velocity_rms',
            'is_cylindrical',
            'absolute_radial_velocity']
        # Aggregate names: the 'avg_' / 'median_' prefix selects the
        # aggregation, the remainder is the source column name.
        self.r_names = [
            'region_npoints',
            'avg_absolute_radial_velocity',
            'median_absolute_radial_velocity']

    def _feature_names(self):
        """Output column order: base columns, then radius-suffixed aggregates."""
        names = list(self.base_names)
        for r in self.r:
            names += ['%s_%s' % (n, r) for n in self.r_names]
        return names

    def compute_point_features(self, point_id):
        """Return the feature vector (a list) for one point of the scene."""
        point_features = [self.seq_ds[n].values[point_id]
                          for n in self.base_names]
        for r in self.r:
            neighbours = self.get_point_neighbours(point_id, r)
            for f in self.r_names:
                # BUG FIX: the original called the non-existent
                # str.statswith(), crashing with AttributeError.
                if f.startswith('avg_'):
                    vals = self.seq_ds[[f[4:]]].values[neighbours]
                    point_features.append(np.average(vals))
                elif f.startswith('median_'):
                    vals = self.seq_ds[[f[7:]]].values[neighbours]
                    point_features.append(np.median(vals))
                elif f == 'region_npoints':
                    point_features.append(len(neighbours))
                else:
                    # BUG FIX: raising a plain string is a TypeError in
                    # Python 3; raise a proper exception instead.
                    raise ValueError('Unknown feature: %s' % f)
        return point_features

    def get_point_neighbours(self, point_id, r):
        """Indices of scene points within radius `r` of `point_id`
        (includes the point itself)."""
        return self.index.query_radius(self.xyz[point_id][np.newaxis, :], r=r)[0]

    def __call__(self, seq_ds):
        """Compute the feature DataFrame for one scene (one `seq`)."""
        self.xyz = seq_ds[['x', 'y', 'z']].values[:]
        self.seq_ds = seq_ds
        self.index = KDTree(self.xyz)
        features = [self.compute_point_features(i)
                    for i in range(len(self.xyz))]
        return pd.DataFrame(columns=self._feature_names(), data=features)
from tqdm import tqdm
import os

# Compute (or load from cache) the per-scene feature tables, one CSV per seq.
f_computer = ComputeFeatures()
features = []
for seq in tqdm(seqs):
    file_name = './features/' + str(seq) + '.csv'
    if not os.path.isfile(file_name):
        seq_ds = ds[ds['seq'] == seq]
        seq_f = f_computer(seq_ds)
        seq_f.to_csv(file_name)
        # BUG FIX: reuse the just-computed table instead of running the
        # expensive feature extraction a second time.
        features.append(seq_f)
    else:
        # BUG FIX: to_csv above stores the index as the first column;
        # read it back as the index, otherwise cached runs gained an
        # extra 'Unnamed: 0' column absent from freshly computed tables.
        features.append(pd.read_csv(file_name, index_col=0))
features[0].head()

# Reassemble train/val/test feature matrices from the per-scene tables,
# using the same seq boundaries as the label split above.
bounds = [(0, train_end), (train_end, val_end), (val_end, len(features))]
X_train, X_val, X_test = [pd.concat(features[lo:hi]) for lo, hi in bounds]
import catboost
def learn(X_train, X_val, y_train, y_val):
    """Fit a CatBoost classifier on the engineered features.

    Early-stops on the validation set and returns the model from the
    best iteration. Note the stronger regularization (low rsm, higher
    l2_leaf_reg) compared to the raw-feature baseline.
    """
    clf = catboost.CatBoostClassifier(
        n_estimators=1000,
        depth=6,
        learning_rate=1e-2,
        rsm=0.1,
        l2_leaf_reg=2,
        custom_loss=['AUC', 'Accuracy'],
    )
    clf.fit(
        catboost.Pool(X_train, y_train),
        eval_set=(X_val.values, y_val.values),
        early_stopping_rounds=100,
        use_best_model=True,
        plot=True,
        verbose=False,
    )
    return clf
# Train and evaluate the model built on the engineered features.
cls = learn(X_train, X_val, y_train, y_val)
models = [('my classifier', cls, X_test, y_test)]
plot_pr_rec(*models)
Well, a slightly better result than when using heuristics, though not by much
# Binarize the test-set scores at a hand-picked operating threshold
# and report point metrics there.
THR = 0.273
scores = cls.predict_proba(X_test)[:, 1]
y_hat = (scores > THR).astype(np.float64)
print('precission', precission(y_hat, y_test))
print('recall', recall(y_hat, y_test))